In [1]:
pip install shap
Requirement already satisfied: shap in c:\users\chand\anaconda3\lib\site-packages (0.46.0) Requirement already satisfied: numpy in c:\users\chand\anaconda3\lib\site-packages (from shap) (1.26.4) Requirement already satisfied: scipy in c:\users\chand\anaconda3\lib\site-packages (from shap) (1.11.4) Requirement already satisfied: scikit-learn in c:\users\chand\anaconda3\lib\site-packages (from shap) (1.5.1) Requirement already satisfied: pandas in c:\users\chand\anaconda3\lib\site-packages (from shap) (2.1.4) Requirement already satisfied: tqdm>=4.27.0 in c:\users\chand\anaconda3\lib\site-packages (from shap) (4.65.0) Requirement already satisfied: packaging>20.9 in c:\users\chand\anaconda3\lib\site-packages (from shap) (23.1) Requirement already satisfied: slicer==0.0.8 in c:\users\chand\anaconda3\lib\site-packages (from shap) (0.0.8) Requirement already satisfied: numba in c:\users\chand\anaconda3\lib\site-packages (from shap) (0.59.0) Requirement already satisfied: cloudpickle in c:\users\chand\anaconda3\lib\site-packages (from shap) (2.2.1) Requirement already satisfied: colorama in c:\users\chand\anaconda3\lib\site-packages (from tqdm>=4.27.0->shap) (0.4.6) Requirement already satisfied: llvmlite<0.43,>=0.42.0dev0 in c:\users\chand\anaconda3\lib\site-packages (from numba->shap) (0.42.0) Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\chand\anaconda3\lib\site-packages (from pandas->shap) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\chand\anaconda3\lib\site-packages (from pandas->shap) (2023.3.post1) Requirement already satisfied: tzdata>=2022.1 in c:\users\chand\anaconda3\lib\site-packages (from pandas->shap) (2023.3) Requirement already satisfied: joblib>=1.2.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn->shap) (1.2.0) Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn->shap) (3.5.0) Requirement already satisfied: six>=1.5 in c:\users\chand\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas->shap) (1.16.0) Note: you may need to restart the kernel to use updated packages.
In [2]:
pip install interpret
Requirement already satisfied: interpret in c:\users\chand\anaconda3\lib\site-packages (0.6.3) Requirement already satisfied: interpret-core==0.6.3 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.6.3) Requirement already satisfied: numpy>=1.11.1 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core==0.6.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.26.4) Requirement already satisfied: scipy>=0.18.1 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core==0.6.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.11.4) Requirement already satisfied: pandas>=0.19.2 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core==0.6.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.1.4) Requirement already satisfied: scikit-learn>=0.18.1 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core==0.6.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.5.1) Requirement already satisfied: joblib>=0.11 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core==0.6.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.2.0) Requirement already satisfied: aplr>=10.5.1 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (10.6.0) Requirement already satisfied: dash>=1.0.0 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.17.1) Requirement already satisfied: dash-core-components>=1.0.0 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.0.0) Requirement already satisfied: dash-html-components>=1.0.0 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.0.0) Requirement already satisfied: dash-table>=4.1.0 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (5.0.0) Requirement already satisfied: dash-cytoscape>=0.1.1 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.0.2) Requirement already satisfied: gevent>=1.3.6 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (24.2.1) Requirement already satisfied: requests>=2.19.0 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.31.0) Requirement already satisfied: psutil>=5.6.2 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (5.9.0) Requirement already satisfied: ipykernel>=4.10.0 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (6.28.0) Requirement already satisfied: ipython>=5.5.0 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (8.20.0) Requirement already satisfied: plotly>=3.8.1 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (5.9.0) Requirement already satisfied: SALib>=1.3.3 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.5.0) Requirement already satisfied: shap>=0.28.5 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.46.0) Requirement already satisfied: dill>=0.2.5 in c:\users\chand\anaconda3\lib\site-packages (from interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.3.8) Requirement already satisfied: Flask<3.1,>=1.0.4 in c:\users\chand\anaconda3\lib\site-packages (from dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.2.5) Requirement already satisfied: Werkzeug<3.1 in c:\users\chand\anaconda3\lib\site-packages (from dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.2.3) Requirement already satisfied: importlib-metadata in c:\users\chand\anaconda3\lib\site-packages (from dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (7.0.1) Requirement already satisfied: typing-extensions>=4.1.1 in c:\users\chand\anaconda3\lib\site-packages (from dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (4.12.2) Requirement already satisfied: retrying in c:\users\chand\anaconda3\lib\site-packages (from dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.3.4) Requirement already satisfied: nest-asyncio in c:\users\chand\anaconda3\lib\site-packages (from dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.6.0) Requirement already satisfied: setuptools in c:\users\chand\anaconda3\lib\site-packages (from dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (68.2.2) Requirement already satisfied: zope.event in c:\users\chand\anaconda3\lib\site-packages (from gevent>=1.3.6->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (5.0) Requirement already satisfied: zope.interface in c:\users\chand\anaconda3\lib\site-packages (from gevent>=1.3.6->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (5.4.0) Requirement already satisfied: greenlet>=3.0rc3 in c:\users\chand\anaconda3\lib\site-packages (from gevent>=1.3.6->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (3.0.1) Requirement already satisfied: cffi>=1.12.2 in c:\users\chand\anaconda3\lib\site-packages (from gevent>=1.3.6->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.16.0) Requirement already satisfied: comm>=0.1.1 in c:\users\chand\anaconda3\lib\site-packages (from ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.2.2) Requirement already satisfied: debugpy>=1.6.5 in c:\users\chand\anaconda3\lib\site-packages (from ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.6.7) Requirement already satisfied: jupyter-client>=6.1.12 in c:\users\chand\anaconda3\lib\site-packages (from ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (8.6.0) Requirement already satisfied: jupyter-core!=5.0.*,>=4.12 in c:\users\chand\anaconda3\lib\site-packages (from ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (5.5.0) Requirement already satisfied: matplotlib-inline>=0.1 in c:\users\chand\anaconda3\lib\site-packages (from ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.1.6) Requirement already satisfied: packaging in c:\users\chand\anaconda3\lib\site-packages (from ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (23.1) Requirement already satisfied: pyzmq>=24 in c:\users\chand\anaconda3\lib\site-packages (from ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (25.1.2) Requirement already satisfied: tornado>=6.1 in c:\users\chand\anaconda3\lib\site-packages (from ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (6.3.3) Requirement already satisfied: traitlets>=5.4.0 in c:\users\chand\anaconda3\lib\site-packages (from ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (5.7.1) Requirement already satisfied: decorator in c:\users\chand\anaconda3\lib\site-packages (from ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (5.1.1) Requirement already satisfied: jedi>=0.16 in c:\users\chand\anaconda3\lib\site-packages (from ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.18.1) Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.41 in c:\users\chand\anaconda3\lib\site-packages (from ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (3.0.43) Requirement already satisfied: pygments>=2.4.0 in c:\users\chand\anaconda3\lib\site-packages (from ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.15.1) Requirement already satisfied: stack-data in c:\users\chand\anaconda3\lib\site-packages (from ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.2.0) Requirement already satisfied: colorama in c:\users\chand\anaconda3\lib\site-packages (from ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.4.6) Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\chand\anaconda3\lib\site-packages (from pandas>=0.19.2->interpret-core==0.6.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\chand\anaconda3\lib\site-packages (from pandas>=0.19.2->interpret-core==0.6.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2023.3.post1) Requirement already satisfied: tzdata>=2022.1 in c:\users\chand\anaconda3\lib\site-packages (from pandas>=0.19.2->interpret-core==0.6.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2023.3) Requirement already satisfied: tenacity>=6.2.0 in c:\users\chand\anaconda3\lib\site-packages (from plotly>=3.8.1->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (8.2.2) Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\chand\anaconda3\lib\site-packages (from requests>=2.19.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in c:\users\chand\anaconda3\lib\site-packages (from requests>=2.19.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (3.4) Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\chand\anaconda3\lib\site-packages (from requests>=2.19.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.0.7) Requirement already satisfied: certifi>=2017.4.17 in c:\users\chand\anaconda3\lib\site-packages (from requests>=2.19.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2024.7.4) Requirement already satisfied: matplotlib>=3.5 in c:\users\chand\anaconda3\lib\site-packages (from SALib>=1.3.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (3.8.0) Requirement already satisfied: multiprocess in c:\users\chand\anaconda3\lib\site-packages (from SALib>=1.3.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.70.16) Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn>=0.18.1->interpret-core==0.6.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (3.5.0) Requirement already satisfied: tqdm>=4.27.0 in c:\users\chand\anaconda3\lib\site-packages (from shap>=0.28.5->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (4.65.0) Requirement already satisfied: slicer==0.0.8 in c:\users\chand\anaconda3\lib\site-packages (from shap>=0.28.5->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.0.8) Requirement already satisfied: numba in c:\users\chand\anaconda3\lib\site-packages (from shap>=0.28.5->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.59.0) Requirement already satisfied: cloudpickle in c:\users\chand\anaconda3\lib\site-packages (from shap>=0.28.5->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.2.1) Requirement already satisfied: pycparser in c:\users\chand\anaconda3\lib\site-packages (from cffi>=1.12.2->gevent>=1.3.6->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.21) Requirement already satisfied: Jinja2>=3.0 in c:\users\chand\anaconda3\lib\site-packages (from Flask<3.1,>=1.0.4->dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (3.1.3) Requirement already satisfied: itsdangerous>=2.0 in c:\users\chand\anaconda3\lib\site-packages (from Flask<3.1,>=1.0.4->dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.0.1) Requirement already satisfied: click>=8.0 in c:\users\chand\anaconda3\lib\site-packages (from Flask<3.1,>=1.0.4->dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (8.1.7) Requirement already satisfied: parso<0.9.0,>=0.8.0 in c:\users\chand\anaconda3\lib\site-packages (from jedi>=0.16->ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.8.3) Requirement already satisfied: platformdirs>=2.5 in c:\users\chand\anaconda3\lib\site-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (3.10.0) Requirement already satisfied: pywin32>=300 in c:\users\chand\anaconda3\lib\site-packages (from jupyter-core!=5.0.*,>=4.12->ipykernel>=4.10.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (305.1) Requirement already satisfied: contourpy>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib>=3.5->SALib>=1.3.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.2.0) Requirement already satisfied: cycler>=0.10 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib>=3.5->SALib>=1.3.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib>=3.5->SALib>=1.3.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib>=3.5->SALib>=1.3.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.4.4) Requirement already satisfied: pillow>=6.2.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib>=3.5->SALib>=1.3.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (10.2.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib>=3.5->SALib>=1.3.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (3.0.9) Requirement already satisfied: wcwidth in c:\users\chand\anaconda3\lib\site-packages (from prompt-toolkit<3.1.0,>=3.0.41->ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.2.5) Requirement already satisfied: six>=1.5 in c:\users\chand\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas>=0.19.2->interpret-core==0.6.3->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (1.16.0) Requirement already satisfied: MarkupSafe>=2.1.1 in c:\users\chand\anaconda3\lib\site-packages (from Werkzeug<3.1->dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.1.3) Requirement already satisfied: zipp>=0.5 in c:\users\chand\anaconda3\lib\site-packages (from importlib-metadata->dash>=1.0.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (3.17.0) Requirement already satisfied: llvmlite<0.43,>=0.42.0dev0 in c:\users\chand\anaconda3\lib\site-packages (from numba->shap>=0.28.5->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.42.0) Requirement already satisfied: executing in c:\users\chand\anaconda3\lib\site-packages (from stack-data->ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.8.3) Requirement already satisfied: asttokens in c:\users\chand\anaconda3\lib\site-packages (from stack-data->ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (2.0.5) Requirement already satisfied: pure-eval in c:\users\chand\anaconda3\lib\site-packages (from stack-data->ipython>=5.5.0->interpret-core[aplr,dash,debug,linear,notebook,plotly,sensitivity,shap]==0.6.3->interpret) (0.2.2) Note: you may need to restart the kernel to use updated packages.
In [3]:
pip install catboost shap
Requirement already satisfied: catboost in c:\users\chand\anaconda3\lib\site-packages (1.2.5)Note: you may need to restart the kernel to use updated packages. Requirement already satisfied: shap in c:\users\chand\anaconda3\lib\site-packages (0.46.0) Requirement already satisfied: graphviz in c:\users\chand\anaconda3\lib\site-packages (from catboost) (0.20.3) Requirement already satisfied: matplotlib in c:\users\chand\anaconda3\lib\site-packages (from catboost) (3.8.0) Requirement already satisfied: numpy>=1.16.0 in c:\users\chand\anaconda3\lib\site-packages (from catboost) (1.26.4) Requirement already satisfied: pandas>=0.24 in c:\users\chand\anaconda3\lib\site-packages (from catboost) (2.1.4) Requirement already satisfied: scipy in c:\users\chand\anaconda3\lib\site-packages (from catboost) (1.11.4) Requirement already satisfied: plotly in c:\users\chand\anaconda3\lib\site-packages (from catboost) (5.9.0) Requirement already satisfied: six in c:\users\chand\anaconda3\lib\site-packages (from catboost) (1.16.0) Requirement already satisfied: scikit-learn in c:\users\chand\anaconda3\lib\site-packages (from shap) (1.5.1) Requirement already satisfied: tqdm>=4.27.0 in c:\users\chand\anaconda3\lib\site-packages (from shap) (4.65.0) Requirement already satisfied: packaging>20.9 in c:\users\chand\anaconda3\lib\site-packages (from shap) (23.1) Requirement already satisfied: slicer==0.0.8 in c:\users\chand\anaconda3\lib\site-packages (from shap) (0.0.8) Requirement already satisfied: numba in c:\users\chand\anaconda3\lib\site-packages (from shap) (0.59.0) Requirement already satisfied: cloudpickle in c:\users\chand\anaconda3\lib\site-packages (from shap) (2.2.1) Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\chand\anaconda3\lib\site-packages (from pandas>=0.24->catboost) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\chand\anaconda3\lib\site-packages (from pandas>=0.24->catboost) (2023.3.post1) Requirement already satisfied: tzdata>=2022.1 in c:\users\chand\anaconda3\lib\site-packages (from pandas>=0.24->catboost) (2023.3) Requirement already satisfied: colorama in c:\users\chand\anaconda3\lib\site-packages (from tqdm>=4.27.0->shap) (0.4.6) Requirement already satisfied: contourpy>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->catboost) (1.2.0) Requirement already satisfied: cycler>=0.10 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->catboost) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->catboost) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->catboost) (1.4.4) Requirement already satisfied: pillow>=6.2.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->catboost) (10.2.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->catboost) (3.0.9) Requirement already satisfied: llvmlite<0.43,>=0.42.0dev0 in c:\users\chand\anaconda3\lib\site-packages (from numba->shap) (0.42.0) Requirement already satisfied: tenacity>=6.2.0 in c:\users\chand\anaconda3\lib\site-packages (from plotly->catboost) (8.2.2) Requirement already satisfied: joblib>=1.2.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn->shap) (1.2.0) Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn->shap) (3.5.0)
In [4]:
pip install lime
Requirement already satisfied: lime in c:\users\chand\anaconda3\lib\site-packages (0.2.0.1) Requirement already satisfied: matplotlib in c:\users\chand\anaconda3\lib\site-packages (from lime) (3.8.0) Requirement already satisfied: numpy in c:\users\chand\anaconda3\lib\site-packages (from lime) (1.26.4) Requirement already satisfied: scipy in c:\users\chand\anaconda3\lib\site-packages (from lime) (1.11.4) Requirement already satisfied: tqdm in c:\users\chand\anaconda3\lib\site-packages (from lime) (4.65.0) Requirement already satisfied: scikit-learn>=0.18 in c:\users\chand\anaconda3\lib\site-packages (from lime) (1.5.1) Requirement already satisfied: scikit-image>=0.12 in c:\users\chand\anaconda3\lib\site-packages (from lime) (0.22.0) Requirement already satisfied: networkx>=2.8 in c:\users\chand\anaconda3\lib\site-packages (from scikit-image>=0.12->lime) (3.1) Requirement already satisfied: pillow>=9.0.1 in c:\users\chand\anaconda3\lib\site-packages (from scikit-image>=0.12->lime) (10.2.0) Requirement already satisfied: imageio>=2.27 in c:\users\chand\anaconda3\lib\site-packages (from scikit-image>=0.12->lime) (2.33.1) Requirement already satisfied: tifffile>=2022.8.12 in c:\users\chand\anaconda3\lib\site-packages (from scikit-image>=0.12->lime) (2023.4.12) Requirement already satisfied: packaging>=21 in c:\users\chand\anaconda3\lib\site-packages (from scikit-image>=0.12->lime) (23.1) Requirement already satisfied: lazy_loader>=0.3 in c:\users\chand\anaconda3\lib\site-packages (from scikit-image>=0.12->lime) (0.3) Requirement already satisfied: joblib>=1.2.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn>=0.18->lime) (1.2.0) Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn>=0.18->lime) (3.5.0) Requirement already satisfied: contourpy>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->lime) (1.2.0) Requirement already satisfied: cycler>=0.10 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->lime) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->lime) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->lime) (1.4.4) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->lime) (3.0.9) Requirement already satisfied: python-dateutil>=2.7 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->lime) (2.8.2) Requirement already satisfied: colorama in c:\users\chand\anaconda3\lib\site-packages (from tqdm->lime) (0.4.6) Requirement already satisfied: six>=1.5 in c:\users\chand\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib->lime) (1.16.0) Note: you may need to restart the kernel to use updated packages.
In [5]:
pip install imodels
Requirement already satisfied: imodels in c:\users\chand\anaconda3\lib\site-packages (1.4.6) Requirement already satisfied: matplotlib in c:\users\chand\anaconda3\lib\site-packages (from imodels) (3.8.0) Requirement already satisfied: mlxtend>=0.18.0 in c:\users\chand\anaconda3\lib\site-packages (from imodels) (0.23.1) Requirement already satisfied: numpy in c:\users\chand\anaconda3\lib\site-packages (from imodels) (1.26.4) Requirement already satisfied: pandas in c:\users\chand\anaconda3\lib\site-packages (from imodels) (2.1.4) Requirement already satisfied: requests in c:\users\chand\anaconda3\lib\site-packages (from imodels) (2.31.0) Requirement already satisfied: scipy in c:\users\chand\anaconda3\lib\site-packages (from imodels) (1.11.4) Requirement already satisfied: scikit-learn>=1.2.0 in c:\users\chand\anaconda3\lib\site-packages (from imodels) (1.5.1) Requirement already satisfied: tqdm in c:\users\chand\anaconda3\lib\site-packages (from imodels) (4.65.0) Requirement already satisfied: joblib>=0.13.2 in c:\users\chand\anaconda3\lib\site-packages (from mlxtend>=0.18.0->imodels) (1.2.0) Requirement already satisfied: contourpy>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->imodels) (1.2.0) Requirement already satisfied: cycler>=0.10 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->imodels) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->imodels) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->imodels) (1.4.4) Requirement already satisfied: packaging>=20.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->imodels) (23.1) Requirement already satisfied: pillow>=6.2.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->imodels) (10.2.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->imodels) (3.0.9) Requirement already satisfied: python-dateutil>=2.7 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib->imodels) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\chand\anaconda3\lib\site-packages (from pandas->imodels) (2023.3.post1) Requirement already satisfied: tzdata>=2022.1 in c:\users\chand\anaconda3\lib\site-packages (from pandas->imodels) (2023.3) Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn>=1.2.0->imodels) (3.5.0) Requirement already satisfied: charset-normalizer<4,>=2 in c:\users\chand\anaconda3\lib\site-packages (from requests->imodels) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in c:\users\chand\anaconda3\lib\site-packages (from requests->imodels) (3.4) Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\chand\anaconda3\lib\site-packages (from requests->imodels) (2.0.7) Requirement already satisfied: certifi>=2017.4.17 in c:\users\chand\anaconda3\lib\site-packages (from requests->imodels) (2024.7.4) Requirement already satisfied: colorama in c:\users\chand\anaconda3\lib\site-packages (from tqdm->imodels) (0.4.6) Requirement already satisfied: six>=1.5 in c:\users\chand\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib->imodels) (1.16.0) Note: you may need to restart the kernel to use updated packages.
In [6]:
pip install matplotlib shap scikit-learn
Requirement already satisfied: matplotlib in c:\users\chand\anaconda3\lib\site-packages (3.8.0) Requirement already satisfied: shap in c:\users\chand\anaconda3\lib\site-packages (0.46.0) Requirement already satisfied: scikit-learn in c:\users\chand\anaconda3\lib\site-packages (1.5.1) Requirement already satisfied: contourpy>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (1.2.0) Requirement already satisfied: cycler>=0.10 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (1.4.4) Requirement already satisfied: numpy<2,>=1.21 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (1.26.4) Requirement already satisfied: packaging>=20.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (23.1) Requirement already satisfied: pillow>=6.2.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (10.2.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (3.0.9) Requirement already satisfied: python-dateutil>=2.7 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (2.8.2) Requirement already satisfied: scipy in c:\users\chand\anaconda3\lib\site-packages (from shap) (1.11.4) Requirement already satisfied: pandas in c:\users\chand\anaconda3\lib\site-packages (from shap) (2.1.4) Requirement already satisfied: tqdm>=4.27.0 in c:\users\chand\anaconda3\lib\site-packages (from shap) (4.65.0) Requirement already satisfied: slicer==0.0.8 in c:\users\chand\anaconda3\lib\site-packages (from shap) (0.0.8) Requirement already satisfied: numba in c:\users\chand\anaconda3\lib\site-packages (from shap) (0.59.0) Requirement already satisfied: cloudpickle in c:\users\chand\anaconda3\lib\site-packages (from shap) (2.2.1) Requirement already satisfied: joblib>=1.2.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn) (1.2.0) Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn) (3.5.0) Requirement already satisfied: six>=1.5 in c:\users\chand\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0) Requirement already satisfied: colorama in c:\users\chand\anaconda3\lib\site-packages (from tqdm>=4.27.0->shap) (0.4.6) Requirement already satisfied: llvmlite<0.43,>=0.42.0dev0 in c:\users\chand\anaconda3\lib\site-packages (from numba->shap) (0.42.0) Requirement already satisfied: pytz>=2020.1 in c:\users\chand\anaconda3\lib\site-packages (from pandas->shap) (2023.3.post1) Requirement already satisfied: tzdata>=2022.1 in c:\users\chand\anaconda3\lib\site-packages (from pandas->shap) (2023.3) Note: you may need to restart the kernel to use updated packages.
In [7]:
pip install matplotlib scikit-learn
Requirement already satisfied: matplotlib in c:\users\chand\anaconda3\lib\site-packages (3.8.0) Requirement already satisfied: scikit-learn in c:\users\chand\anaconda3\lib\site-packages (1.5.1) Requirement already satisfied: contourpy>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (1.2.0) Requirement already satisfied: cycler>=0.10 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (4.25.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (1.4.4) Requirement already satisfied: numpy<2,>=1.21 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (1.26.4) Requirement already satisfied: packaging>=20.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (23.1) Requirement already satisfied: pillow>=6.2.0 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (10.2.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (3.0.9) Requirement already satisfied: python-dateutil>=2.7 in c:\users\chand\anaconda3\lib\site-packages (from matplotlib) (2.8.2) Requirement already satisfied: scipy>=1.6.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn) (1.11.4) Requirement already satisfied: joblib>=1.2.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn) (1.2.0) Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn) (3.5.0) Requirement already satisfied: six>=1.5 in c:\users\chand\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib) (1.16.0) Note: you may need to restart the kernel to use updated packages.
In [8]:
pip install -U scikit-learn
Requirement already satisfied: scikit-learn in c:\users\chand\anaconda3\lib\site-packages (1.5.1) Requirement already satisfied: numpy>=1.19.5 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn) (1.26.4) Requirement already satisfied: scipy>=1.6.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn) (1.11.4) Requirement already satisfied: joblib>=1.2.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn) (1.2.0) Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\chand\anaconda3\lib\site-packages (from scikit-learn) (3.5.0) Note: you may need to restart the kernel to use updated packages.
Importing Libraries¶
In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from collections import Counter
from scipy import stats
from scipy.stats import zscore
import warnings
warnings.filterwarnings('ignore')
sns.set()
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import roc_curve, auc, classification_report, accuracy_score, confusion_matrix
from sklearn.feature_selection import RFE
plt.style.use('ggplot')
from sklearn.decomposition import PCA
import shap
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, confusion_matrix, roc_curve, roc_auc_score
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.metrics import ConfusionMatrixDisplay
Basic Pre-Processing¶
#Loading the dataset¶
In [10]:
bc_data= pd.read_csv("C://Users/chand/Documents/Dissertation/Dataset/Breast_cancer.csv")
bc_data
Out[10]:
| id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.30010 | 0.14710 | ... | 25.380 | 17.33 | 184.60 | 2019.0 | 0.16220 | 0.66560 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
| 1 | 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.08690 | 0.07017 | ... | 24.990 | 23.41 | 158.80 | 1956.0 | 0.12380 | 0.18660 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
| 2 | 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.19740 | 0.12790 | ... | 23.570 | 25.53 | 152.50 | 1709.0 | 0.14440 | 0.42450 | 0.4504 | 0.2430 | 0.3613 | 0.08758 |
| 3 | 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.24140 | 0.10520 | ... | 14.910 | 26.50 | 98.87 | 567.7 | 0.20980 | 0.86630 | 0.6869 | 0.2575 | 0.6638 | 0.17300 |
| 4 | 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.19800 | 0.10430 | ... | 22.540 | 16.67 | 152.20 | 1575.0 | 0.13740 | 0.20500 | 0.4000 | 0.1625 | 0.2364 | 0.07678 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 564 | 926424 | M | 21.56 | 22.39 | 142.00 | 1479.0 | 0.11100 | 0.11590 | 0.24390 | 0.13890 | ... | 25.450 | 26.40 | 166.10 | 2027.0 | 0.14100 | 0.21130 | 0.4107 | 0.2216 | 0.2060 | 0.07115 |
| 565 | 926682 | M | 20.13 | 28.25 | 131.20 | 1261.0 | 0.09780 | 0.10340 | 0.14400 | 0.09791 | ... | 23.690 | 38.25 | 155.00 | 1731.0 | 0.11660 | 0.19220 | 0.3215 | 0.1628 | 0.2572 | 0.06637 |
| 566 | 926954 | M | 16.60 | 28.08 | 108.30 | 858.1 | 0.08455 | 0.10230 | 0.09251 | 0.05302 | ... | 18.980 | 34.12 | 126.70 | 1124.0 | 0.11390 | 0.30940 | 0.3403 | 0.1418 | 0.2218 | 0.07820 |
| 567 | 927241 | M | 20.60 | 29.33 | 140.10 | 1265.0 | 0.11780 | 0.27700 | 0.35140 | 0.15200 | ... | 25.740 | 39.42 | 184.60 | 1821.0 | 0.16500 | 0.86810 | 0.9387 | 0.2650 | 0.4087 | 0.12400 |
| 568 | 92751 | B | 7.76 | 24.54 | 47.92 | 181.0 | 0.05263 | 0.04362 | 0.00000 | 0.00000 | ... | 9.456 | 30.37 | 59.16 | 268.6 | 0.08996 | 0.06444 | 0.0000 | 0.0000 | 0.2871 | 0.07039 |
569 rows × 32 columns
In [11]:
bc_data.shape
Out[11]:
(569, 32)
In [12]:
bc_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 569 entries, 0 to 568 Data columns (total 32 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 569 non-null int64 1 diagnosis 569 non-null object 2 radius_mean 569 non-null float64 3 texture_mean 569 non-null float64 4 perimeter_mean 569 non-null float64 5 area_mean 569 non-null float64 6 smoothness_mean 569 non-null float64 7 compactness_mean 569 non-null float64 8 concavity_mean 569 non-null float64 9 concave points_mean 569 non-null float64 10 symmetry_mean 569 non-null float64 11 fractal_dimension_mean 569 non-null float64 12 radius_se 569 non-null float64 13 texture_se 569 non-null float64 14 perimeter_se 569 non-null float64 15 area_se 569 non-null float64 16 smoothness_se 569 non-null float64 17 compactness_se 569 non-null float64 18 concavity_se 569 non-null float64 19 concave points_se 569 non-null float64 20 symmetry_se 569 non-null float64 21 fractal_dimension_se 569 non-null float64 22 radius_worst 569 non-null float64 23 texture_worst 569 non-null float64 24 perimeter_worst 569 non-null float64 25 area_worst 569 non-null float64 26 smoothness_worst 569 non-null float64 27 compactness_worst 569 non-null float64 28 concavity_worst 569 non-null float64 29 concave points_worst 569 non-null float64 30 symmetry_worst 569 non-null float64 31 fractal_dimension_worst 569 non-null float64 dtypes: float64(30), int64(1), object(1) memory usage: 142.4+ KB
In [13]:
# Convert the target variable 'diagnosis' column to numerical values
bc_data['diagnosis'] = bc_data['diagnosis'].map({'M': 1, 'B': 0})
Exploratory Data Analysis¶
#Summary statistics¶
In [14]:
bc_data.describe()
Out[14]:
| id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 5.690000e+02 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | ... | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 | 569.000000 |
| mean | 3.037183e+07 | 0.372583 | 14.127292 | 19.289649 | 91.969033 | 654.889104 | 0.096360 | 0.104341 | 0.088799 | 0.048919 | ... | 16.269190 | 25.677223 | 107.261213 | 880.583128 | 0.132369 | 0.254265 | 0.272188 | 0.114606 | 0.290076 | 0.083946 |
| std | 1.250206e+08 | 0.483918 | 3.524049 | 4.301036 | 24.298981 | 351.914129 | 0.014064 | 0.052813 | 0.079720 | 0.038803 | ... | 4.833242 | 6.146258 | 33.602542 | 569.356993 | 0.022832 | 0.157336 | 0.208624 | 0.065732 | 0.061867 | 0.018061 |
| min | 8.670000e+03 | 0.000000 | 6.981000 | 9.710000 | 43.790000 | 143.500000 | 0.052630 | 0.019380 | 0.000000 | 0.000000 | ... | 7.930000 | 12.020000 | 50.410000 | 185.200000 | 0.071170 | 0.027290 | 0.000000 | 0.000000 | 0.156500 | 0.055040 |
| 25% | 8.692180e+05 | 0.000000 | 11.700000 | 16.170000 | 75.170000 | 420.300000 | 0.086370 | 0.064920 | 0.029560 | 0.020310 | ... | 13.010000 | 21.080000 | 84.110000 | 515.300000 | 0.116600 | 0.147200 | 0.114500 | 0.064930 | 0.250400 | 0.071460 |
| 50% | 9.060240e+05 | 0.000000 | 13.370000 | 18.840000 | 86.240000 | 551.100000 | 0.095870 | 0.092630 | 0.061540 | 0.033500 | ... | 14.970000 | 25.410000 | 97.660000 | 686.500000 | 0.131300 | 0.211900 | 0.226700 | 0.099930 | 0.282200 | 0.080040 |
| 75% | 8.813129e+06 | 1.000000 | 15.780000 | 21.800000 | 104.100000 | 782.700000 | 0.105300 | 0.130400 | 0.130700 | 0.074000 | ... | 18.790000 | 29.720000 | 125.400000 | 1084.000000 | 0.146000 | 0.339100 | 0.382900 | 0.161400 | 0.317900 | 0.092080 |
| max | 9.113205e+08 | 1.000000 | 28.110000 | 39.280000 | 188.500000 | 2501.000000 | 0.163400 | 0.345400 | 0.426800 | 0.201200 | ... | 36.040000 | 49.540000 | 251.200000 | 4254.000000 | 0.222600 | 1.058000 | 1.252000 | 0.291000 | 0.663800 | 0.207500 |
8 rows × 32 columns
Distribution of Target Variable¶
In [15]:
plt.figure(figsize=(8, 6))
sns.countplot(x='diagnosis', data=bc_data, palette='viridis')
plt.title('Distribution of Diagnosis')
plt.xlabel('Diagnosis (0 = Benign, 1 = Malignant)')
plt.ylabel('Count')
plt.show()
Density Graph to check the trends of data¶
In [16]:
plt.figure(figsize=(20,15))
plotnumber = 1
for column in bc_data:
if plotnumber<=30:
ax = plt.subplot(5,6,plotnumber)
sns.distplot(bc_data[column])
plt.xlabel(column)
plotnumber+=1
plt.tight_layout()
plt.show()
Correlation Analysis¶
In [17]:
bc_data.corr()
Out[17]:
| id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id | 1.000000 | 0.039769 | 0.074626 | 0.099770 | 0.073159 | 0.096893 | -0.012968 | 0.000096 | 0.050080 | 0.044158 | ... | 0.082405 | 0.064720 | 0.079986 | 0.107187 | 0.010338 | -0.002968 | 0.023203 | 0.035174 | -0.044224 | -0.029866 |
| diagnosis | 0.039769 | 1.000000 | 0.730029 | 0.415185 | 0.742636 | 0.708984 | 0.358560 | 0.596534 | 0.696360 | 0.776614 | ... | 0.776454 | 0.456903 | 0.782914 | 0.733825 | 0.421465 | 0.590998 | 0.659610 | 0.793566 | 0.416294 | 0.323872 |
| radius_mean | 0.074626 | 0.730029 | 1.000000 | 0.323782 | 0.997855 | 0.987357 | 0.170581 | 0.506124 | 0.676764 | 0.822529 | ... | 0.969539 | 0.297008 | 0.965137 | 0.941082 | 0.119616 | 0.413463 | 0.526911 | 0.744214 | 0.163953 | 0.007066 |
| texture_mean | 0.099770 | 0.415185 | 0.323782 | 1.000000 | 0.329533 | 0.321086 | -0.023389 | 0.236702 | 0.302418 | 0.293464 | ... | 0.352573 | 0.912045 | 0.358040 | 0.343546 | 0.077503 | 0.277830 | 0.301025 | 0.295316 | 0.105008 | 0.119205 |
| perimeter_mean | 0.073159 | 0.742636 | 0.997855 | 0.329533 | 1.000000 | 0.986507 | 0.207278 | 0.556936 | 0.716136 | 0.850977 | ... | 0.969476 | 0.303038 | 0.970387 | 0.941550 | 0.150549 | 0.455774 | 0.563879 | 0.771241 | 0.189115 | 0.051019 |
| area_mean | 0.096893 | 0.708984 | 0.987357 | 0.321086 | 0.986507 | 1.000000 | 0.177028 | 0.498502 | 0.685983 | 0.823269 | ... | 0.962746 | 0.287489 | 0.959120 | 0.959213 | 0.123523 | 0.390410 | 0.512606 | 0.722017 | 0.143570 | 0.003738 |
| smoothness_mean | -0.012968 | 0.358560 | 0.170581 | -0.023389 | 0.207278 | 0.177028 | 1.000000 | 0.659123 | 0.521984 | 0.553695 | ... | 0.213120 | 0.036072 | 0.238853 | 0.206718 | 0.805324 | 0.472468 | 0.434926 | 0.503053 | 0.394309 | 0.499316 |
| compactness_mean | 0.000096 | 0.596534 | 0.506124 | 0.236702 | 0.556936 | 0.498502 | 0.659123 | 1.000000 | 0.883121 | 0.831135 | ... | 0.535315 | 0.248133 | 0.590210 | 0.509604 | 0.565541 | 0.865809 | 0.816275 | 0.815573 | 0.510223 | 0.687382 |
| concavity_mean | 0.050080 | 0.696360 | 0.676764 | 0.302418 | 0.716136 | 0.685983 | 0.521984 | 0.883121 | 1.000000 | 0.921391 | ... | 0.688236 | 0.299879 | 0.729565 | 0.675987 | 0.448822 | 0.754968 | 0.884103 | 0.861323 | 0.409464 | 0.514930 |
| concave points_mean | 0.044158 | 0.776614 | 0.822529 | 0.293464 | 0.850977 | 0.823269 | 0.553695 | 0.831135 | 0.921391 | 1.000000 | ... | 0.830318 | 0.292752 | 0.855923 | 0.809630 | 0.452753 | 0.667454 | 0.752399 | 0.910155 | 0.375744 | 0.368661 |
| symmetry_mean | -0.022114 | 0.330499 | 0.147741 | 0.071401 | 0.183027 | 0.151293 | 0.557775 | 0.602641 | 0.500667 | 0.462497 | ... | 0.185728 | 0.090651 | 0.219169 | 0.177193 | 0.426675 | 0.473200 | 0.433721 | 0.430297 | 0.699826 | 0.438413 |
| fractal_dimension_mean | -0.052511 | -0.012838 | -0.311631 | -0.076437 | -0.261477 | -0.283110 | 0.584792 | 0.565369 | 0.336783 | 0.166917 | ... | -0.253691 | -0.051269 | -0.205151 | -0.231854 | 0.504942 | 0.458798 | 0.346234 | 0.175325 | 0.334019 | 0.767297 |
| radius_se | 0.143048 | 0.567134 | 0.679090 | 0.275869 | 0.691765 | 0.732562 | 0.301467 | 0.497473 | 0.631925 | 0.698050 | ... | 0.715065 | 0.194799 | 0.719684 | 0.751548 | 0.141919 | 0.287103 | 0.380585 | 0.531062 | 0.094543 | 0.049559 |
| texture_se | -0.007526 | -0.008303 | -0.097317 | 0.386358 | -0.086761 | -0.066280 | 0.068406 | 0.046205 | 0.076218 | 0.021480 | ... | -0.111690 | 0.409003 | -0.102242 | -0.083195 | -0.073658 | -0.092439 | -0.068956 | -0.119638 | -0.128215 | -0.045655 |
| perimeter_se | 0.137331 | 0.556141 | 0.674172 | 0.281673 | 0.693135 | 0.726628 | 0.296092 | 0.548905 | 0.660391 | 0.710650 | ... | 0.697201 | 0.200371 | 0.721031 | 0.730713 | 0.130054 | 0.341919 | 0.418899 | 0.554897 | 0.109930 | 0.085433 |
| area_se | 0.177742 | 0.548236 | 0.735864 | 0.259845 | 0.744983 | 0.800086 | 0.246552 | 0.455653 | 0.617427 | 0.690299 | ... | 0.757373 | 0.196497 | 0.761213 | 0.811408 | 0.125389 | 0.283257 | 0.385100 | 0.538166 | 0.074126 | 0.017539 |
| smoothness_se | 0.096781 | -0.067016 | -0.222600 | 0.006614 | -0.202694 | -0.166777 | 0.332375 | 0.135299 | 0.098564 | 0.027653 | ... | -0.230691 | -0.074743 | -0.217304 | -0.182195 | 0.314457 | -0.055558 | -0.058298 | -0.102007 | -0.107342 | 0.101480 |
| compactness_se | 0.033961 | 0.292999 | 0.206000 | 0.191975 | 0.250744 | 0.212583 | 0.318943 | 0.738722 | 0.670279 | 0.490424 | ... | 0.204607 | 0.143003 | 0.260516 | 0.199371 | 0.227394 | 0.678780 | 0.639147 | 0.483208 | 0.277878 | 0.590973 |
| concavity_se | 0.055239 | 0.253730 | 0.194204 | 0.143293 | 0.228082 | 0.207660 | 0.248396 | 0.570517 | 0.691270 | 0.439167 | ... | 0.186904 | 0.100241 | 0.226680 | 0.188353 | 0.168481 | 0.484858 | 0.662564 | 0.440472 | 0.197788 | 0.439329 |
| concave points_se | 0.078768 | 0.408042 | 0.376169 | 0.163851 | 0.407217 | 0.372320 | 0.380676 | 0.642262 | 0.683260 | 0.615634 | ... | 0.358127 | 0.086741 | 0.394999 | 0.342271 | 0.215351 | 0.452888 | 0.549592 | 0.602450 | 0.143116 | 0.310655 |
| symmetry_se | -0.017306 | -0.006522 | -0.104321 | 0.009127 | -0.081629 | -0.072497 | 0.200774 | 0.229977 | 0.178009 | 0.095351 | ... | -0.128121 | -0.077473 | -0.103753 | -0.110343 | -0.012662 | 0.060255 | 0.037119 | -0.030413 | 0.389402 | 0.078079 |
| fractal_dimension_se | 0.025725 | 0.077972 | -0.042641 | 0.054458 | -0.005523 | -0.019887 | 0.283607 | 0.507318 | 0.449301 | 0.257584 | ... | -0.037488 | -0.003195 | -0.001000 | -0.022736 | 0.170568 | 0.390159 | 0.379975 | 0.215204 | 0.111094 | 0.591328 |
| radius_worst | 0.082405 | 0.776454 | 0.969539 | 0.352573 | 0.969476 | 0.962746 | 0.213120 | 0.535315 | 0.688236 | 0.830318 | ... | 1.000000 | 0.359921 | 0.993708 | 0.984015 | 0.216574 | 0.475820 | 0.573975 | 0.787424 | 0.243529 | 0.093492 |
| texture_worst | 0.064720 | 0.456903 | 0.297008 | 0.912045 | 0.303038 | 0.287489 | 0.036072 | 0.248133 | 0.299879 | 0.292752 | ... | 0.359921 | 1.000000 | 0.365098 | 0.345842 | 0.225429 | 0.360832 | 0.368366 | 0.359755 | 0.233027 | 0.219122 |
| perimeter_worst | 0.079986 | 0.782914 | 0.965137 | 0.358040 | 0.970387 | 0.959120 | 0.238853 | 0.590210 | 0.729565 | 0.855923 | ... | 0.993708 | 0.365098 | 1.000000 | 0.977578 | 0.236775 | 0.529408 | 0.618344 | 0.816322 | 0.269493 | 0.138957 |
| area_worst | 0.107187 | 0.733825 | 0.941082 | 0.343546 | 0.941550 | 0.959213 | 0.206718 | 0.509604 | 0.675987 | 0.809630 | ... | 0.984015 | 0.345842 | 0.977578 | 1.000000 | 0.209145 | 0.438296 | 0.543331 | 0.747419 | 0.209146 | 0.079647 |
| smoothness_worst | 0.010338 | 0.421465 | 0.119616 | 0.077503 | 0.150549 | 0.123523 | 0.805324 | 0.565541 | 0.448822 | 0.452753 | ... | 0.216574 | 0.225429 | 0.236775 | 0.209145 | 1.000000 | 0.568187 | 0.518523 | 0.547691 | 0.493838 | 0.617624 |
| compactness_worst | -0.002968 | 0.590998 | 0.413463 | 0.277830 | 0.455774 | 0.390410 | 0.472468 | 0.865809 | 0.754968 | 0.667454 | ... | 0.475820 | 0.360832 | 0.529408 | 0.438296 | 0.568187 | 1.000000 | 0.892261 | 0.801080 | 0.614441 | 0.810455 |
| concavity_worst | 0.023203 | 0.659610 | 0.526911 | 0.301025 | 0.563879 | 0.512606 | 0.434926 | 0.816275 | 0.884103 | 0.752399 | ... | 0.573975 | 0.368366 | 0.618344 | 0.543331 | 0.518523 | 0.892261 | 1.000000 | 0.855434 | 0.532520 | 0.686511 |
| concave points_worst | 0.035174 | 0.793566 | 0.744214 | 0.295316 | 0.771241 | 0.722017 | 0.503053 | 0.815573 | 0.861323 | 0.910155 | ... | 0.787424 | 0.359755 | 0.816322 | 0.747419 | 0.547691 | 0.801080 | 0.855434 | 1.000000 | 0.502528 | 0.511114 |
| symmetry_worst | -0.044224 | 0.416294 | 0.163953 | 0.105008 | 0.189115 | 0.143570 | 0.394309 | 0.510223 | 0.409464 | 0.375744 | ... | 0.243529 | 0.233027 | 0.269493 | 0.209146 | 0.493838 | 0.614441 | 0.532520 | 0.502528 | 1.000000 | 0.537848 |
| fractal_dimension_worst | -0.029866 | 0.323872 | 0.007066 | 0.119205 | 0.051019 | 0.003738 | 0.499316 | 0.687382 | 0.514930 | 0.368661 | ... | 0.093492 | 0.219122 | 0.138957 | 0.079647 | 0.617624 | 0.810455 | 0.686511 | 0.511114 | 0.537848 | 1.000000 |
32 rows × 32 columns
In [18]:
#Heatmap
correlation_matrix = bc_data.corr()
plt.figure(figsize=(20, 20))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", square=True, linewidths=.5)
plt.title('Correlation Matrix Heatmap')
# Show the plot
plt.show()
Handling Missing Values¶
In [19]:
missing_values = bc_data.isnull().sum()
missing_values
Out[19]:
id 0 diagnosis 0 radius_mean 0 texture_mean 0 perimeter_mean 0 area_mean 0 smoothness_mean 0 compactness_mean 0 concavity_mean 0 concave points_mean 0 symmetry_mean 0 fractal_dimension_mean 0 radius_se 0 texture_se 0 perimeter_se 0 area_se 0 smoothness_se 0 compactness_se 0 concavity_se 0 concave points_se 0 symmetry_se 0 fractal_dimension_se 0 radius_worst 0 texture_worst 0 perimeter_worst 0 area_worst 0 smoothness_worst 0 compactness_worst 0 concavity_worst 0 concave points_worst 0 symmetry_worst 0 fractal_dimension_worst 0 dtype: int64
Data Cleaning¶
In [20]:
#Dropping Irrelevant columns
bc_data.drop('id', axis=1, inplace=True)
In [21]:
# Checking for Duplicates
duplicates = bc_data.duplicated().sum()
print(f'Number of duplicate rows: {duplicates}')
Number of duplicate rows: 0
Outlier Detection¶
In [22]:
X = bc_data.drop(columns=['diagnosis'])
In [23]:
# Boxplot before outlier removal
plt.figure(figsize=(16, 12))
sns.boxplot(data=X)
plt.xticks(rotation=90)
plt.title('Boxplot of Features Before Outlier Removal')
plt.savefig('boxplot_before_outlier_removal.png', dpi=300, bbox_inches='tight')
plt.show()
Applying Isolation Forest Algorithm¶
In [24]:
iso_forest = IsolationForest(contamination=0.05, random_state=42)
outliers = iso_forest.fit_predict(X)
bc_data['outlier'] = outliers
# Separating the non-outliers
bc_data_cleaned = bc_data[bc_data['outlier'] != -1].drop(columns=['outlier'])
In [25]:
# Boxplot after outlier removal
X_no_outliers = bc_data_cleaned.drop(columns=['diagnosis'])
plt.figure(figsize=(16, 12))
sns.boxplot(data=X_no_outliers)
plt.xticks(rotation=90)
plt.title('Boxplot of Features After Outlier Removal')
plt.savefig('boxplot_after_outlier_removal.png', dpi=300, bbox_inches='tight')
plt.show()
In [26]:
# cleaned dataset
print(bc_data_cleaned.head())
diagnosis radius_mean texture_mean perimeter_mean area_mean \ 1 1 20.57 17.77 132.90 1326.0 2 1 19.69 21.25 130.00 1203.0 4 1 20.29 14.34 135.10 1297.0 5 1 12.45 15.70 82.57 477.1 6 1 18.25 19.98 119.60 1040.0 smoothness_mean compactness_mean concavity_mean concave points_mean \ 1 0.08474 0.07864 0.0869 0.07017 2 0.10960 0.15990 0.1974 0.12790 4 0.10030 0.13280 0.1980 0.10430 5 0.12780 0.17000 0.1578 0.08089 6 0.09463 0.10900 0.1127 0.07400 symmetry_mean ... radius_worst texture_worst perimeter_worst \ 1 0.1812 ... 24.99 23.41 158.8 2 0.2069 ... 23.57 25.53 152.5 4 0.1809 ... 22.54 16.67 152.2 5 0.2087 ... 15.47 23.75 103.4 6 0.1794 ... 22.88 27.66 153.2 area_worst smoothness_worst compactness_worst concavity_worst \ 1 1956.0 0.1238 0.1866 0.2416 2 1709.0 0.1444 0.4245 0.4504 4 1575.0 0.1374 0.2050 0.4000 5 741.6 0.1791 0.5249 0.5355 6 1606.0 0.1442 0.2576 0.3784 concave points_worst symmetry_worst fractal_dimension_worst 1 0.1860 0.2750 0.08902 2 0.2430 0.3613 0.08758 4 0.1625 0.2364 0.07678 5 0.1741 0.3985 0.12440 6 0.1932 0.3063 0.08368 [5 rows x 31 columns]
In [27]:
bc_data.to_csv('C://Users/chand/Downloads/pre-processed-Data.csv', index=False)
In [28]:
print(bc_data.head())
diagnosis radius_mean texture_mean perimeter_mean area_mean \ 0 1 17.99 10.38 122.80 1001.0 1 1 20.57 17.77 132.90 1326.0 2 1 19.69 21.25 130.00 1203.0 3 1 11.42 20.38 77.58 386.1 4 1 20.29 14.34 135.10 1297.0 smoothness_mean compactness_mean concavity_mean concave points_mean \ 0 0.11840 0.27760 0.3001 0.14710 1 0.08474 0.07864 0.0869 0.07017 2 0.10960 0.15990 0.1974 0.12790 3 0.14250 0.28390 0.2414 0.10520 4 0.10030 0.13280 0.1980 0.10430 symmetry_mean ... texture_worst perimeter_worst area_worst \ 0 0.2419 ... 17.33 184.60 2019.0 1 0.1812 ... 23.41 158.80 1956.0 2 0.2069 ... 25.53 152.50 1709.0 3 0.2597 ... 26.50 98.87 567.7 4 0.1809 ... 16.67 152.20 1575.0 smoothness_worst compactness_worst concavity_worst concave points_worst \ 0 0.1622 0.6656 0.7119 0.2654 1 0.1238 0.1866 0.2416 0.1860 2 0.1444 0.4245 0.4504 0.2430 3 0.2098 0.8663 0.6869 0.2575 4 0.1374 0.2050 0.4000 0.1625 symmetry_worst fractal_dimension_worst outlier 0 0.4601 0.11890 -1 1 0.2750 0.08902 1 2 0.3613 0.08758 1 3 0.6638 0.17300 -1 4 0.2364 0.07678 1 [5 rows x 32 columns]
Feature Scaling¶
In [29]:
scaler = StandardScaler()
features = bc_data.drop(columns=[ 'diagnosis'])
scaled_features = scaler.fit_transform(features)
In [30]:
# Convert the scaled features back to a DataFrame
scaled_features_df = pd.DataFrame(scaled_features, columns=features.columns)
Feature Selection using RFE¶
In [31]:
model = RandomForestClassifier(random_state = 42)
rfe = RFE(estimator=model, n_features_to_select=10)
rfe = rfe.fit(scaled_features_df, bc_data['diagnosis'])
In [32]:
# Get the selected features
selected_features = scaled_features_df.columns[rfe.support_]
selected_features
Out[32]:
Index(['perimeter_mean', 'area_mean', 'concavity_mean', 'concave points_mean',
'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
'concavity_worst', 'concave points_worst'],
dtype='object')
In [33]:
# Extract the selected features from the previous RFE process
selected_features = scaled_features_df.columns[rfe.support_]
# Prepare the feature importance data for the selected features
selected_importances = rfe.estimator_.feature_importances_
selected_feature_importance_df = pd.DataFrame({
'Feature': selected_features,
'Importance': selected_importances
}).sort_values(by='Importance', ascending=False)
# Horizontal Bar Plot of Selected Feature Importances
plt.figure(figsize=(14, 6))
sns.barplot(x='Importance', y='Feature', data=selected_feature_importance_df, palette='viridis', orient='h')
plt.title('Selected Feature Importances after RFE', fontsize=16)
plt.xlabel('Importance', fontsize=14)
plt.ylabel('Feature', fontsize=14)
plt.show()
Feature Engineering using Polynomial Feature¶
In [34]:
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(scaled_features_df[selected_features])
# Convert the polynomial features to a DataFrame
X_poly_df = pd.DataFrame(X_poly, columns=poly.get_feature_names_out(selected_features))
In [35]:
# Combining the polynomial features and target variable into one DataFrame
data_poly = pd.concat([X_poly_df, bc_data['diagnosis'].reset_index(drop=True)], axis=1)
data_poly
Out[35]:
| perimeter_mean | area_mean | concavity_mean | concave points_mean | radius_worst | texture_worst | perimeter_worst | area_worst | concavity_worst | concave points_worst | ... | perimeter_worst area_worst | perimeter_worst concavity_worst | perimeter_worst concave points_worst | area_worst^2 | area_worst concavity_worst | area_worst concave points_worst | concavity_worst^2 | concavity_worst concave points_worst | concave points_worst^2 | diagnosis | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.269934 | 0.984375 | 2.652874 | 2.532475 | 1.886690 | -1.359293 | 2.303601 | 2.001237 | 2.109526 | 2.296076 | ... | 4.610052 | 4.859506 | 5.289242 | 4.004951 | 4.221663 | 4.594994 | 4.450101 | 4.843633 | 5.271966 | 1 |
| 1 | 1.685955 | 1.908708 | -0.023846 | 0.548144 | 1.805927 | -0.369203 | 1.535126 | 1.890489 | -0.146749 | 1.087084 | ... | 2.902139 | -0.225278 | 1.668811 | 3.573949 | -0.277427 | 2.055121 | 0.021535 | -0.159528 | 1.181752 | 1 |
| 2 | 1.566503 | 1.558884 | 1.363478 | 2.037231 | 1.511870 | -0.023974 | 1.347475 | 1.456285 | 0.854974 | 1.955000 | ... | 1.962307 | 1.152056 | 2.634315 | 2.120765 | 1.245085 | 2.847037 | 0.730980 | 1.671474 | 3.822026 | 1 |
| 3 | -0.592687 | -0.764464 | 1.915897 | 1.451707 | -0.281464 | 0.133984 | -0.249939 | -0.550021 | 1.989588 | 2.175786 | ... | 0.137472 | -0.497276 | -0.543814 | 0.302523 | -1.094316 | -1.196728 | 3.958461 | 4.328918 | 4.734045 | 1 |
| 4 | 1.776573 | 1.826229 | 1.371011 | 1.428493 | 1.298575 | -1.466770 | 1.338539 | 1.220724 | 0.613179 | 0.729259 | ... | 1.633988 | 0.820764 | 0.976142 | 1.490168 | 0.748522 | 0.890224 | 0.375988 | 0.447166 | 0.531819 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 564 | 2.060786 | 2.343856 | 1.947285 | 2.320965 | 1.901185 | 0.117700 | 1.752563 | 2.015301 | 0.664512 | 1.629151 | ... | 3.531941 | 1.164599 | 2.855189 | 4.061437 | 1.339192 | 3.283230 | 0.441577 | 1.082591 | 2.654134 | 1 |
| 565 | 1.615931 | 1.723842 | 0.693043 | 1.263669 | 1.536720 | 2.047399 | 1.421940 | 1.494959 | 0.236573 | 0.733827 | ... | 2.125741 | 0.336393 | 1.043458 | 2.234901 | 0.353667 | 1.097041 | 0.055967 | 0.173604 | 0.538502 | 1 |
| 566 | 0.672676 | 0.577953 | 0.046588 | 0.105777 | 0.561361 | 1.374854 | 0.579001 | 0.427906 | 0.326767 | 0.414069 | ... | 0.247758 | 0.189198 | 0.239746 | 0.183103 | 0.139825 | 0.177182 | 0.106776 | 0.135304 | 0.171453 | 1 |
| 567 | 1.982524 | 1.735218 | 3.296944 | 2.658866 | 1.961239 | 2.237926 | 2.303601 | 1.653171 | 3.197605 | 2.289985 | ... | 3.808245 | 7.366004 | 5.275212 | 2.732974 | 5.286187 | 3.785737 | 10.224676 | 7.322468 | 5.244034 | 1 |
| 568 | -1.814389 | -1.347789 | -1.114873 | -1.261820 | -1.410893 | 0.764190 | -1.432735 | -1.075813 | -1.305831 | -1.745063 | ... | 1.541355 | 1.870909 | 2.500212 | 1.157373 | 1.404829 | 1.877361 | 1.705194 | 2.278757 | 3.045244 | 0 |
569 rows × 66 columns
In [36]:
# Plot pairplot for a subset of polynomial features
import matplotlib.patches as mpatches
sns.pairplot(data_poly[['perimeter_worst', 'area_worst', 'perimeter_worst^2', 'area_worst^2', 'diagnosis']], hue='diagnosis')
plt.savefig('pairplot.png', dpi=300, bbox_inches='tight')
plt.show()
Data Balancing¶
In [37]:
# Separate the majority and minority classes
majority_class = data_poly[data_poly['diagnosis'] == 0]
minority_class = data_poly[data_poly['diagnosis'] == 1]
# Over-sample the minority class
minority_class_over = resample(minority_class,
replace=True,
n_samples=len(majority_class),
random_state=42)
# Combine the majority class with the over-sampled minority class
data_balanced_over = pd.concat([majority_class, minority_class_over])
# Split the balanced dataset into features and target variable
X_balanced_over = data_balanced_over.drop(columns=['diagnosis'])
y_balanced_over = data_balanced_over['diagnosis']
# Split the balanced data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_balanced_over, y_balanced_over, test_size=0.3, random_state=42)
print(y_train.value_counts())
diagnosis 1 259 0 240 Name: count, dtype: int64
In [38]:
# Distribution of the target variable before and after balancing
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
sns.countplot(x=bc_data['diagnosis'], palette='viridis')
plt.title('Distribution of Target Variable Before Balancing')
plt.xlabel('Diagnosis')
plt.ylabel('Count')
plt.subplot(1, 2, 2)
sns.countplot(x=y_balanced_over, palette='viridis')
plt.title('Distribution of Target Variable After Balancing')
plt.xlabel('Diagnosis')
plt.ylabel('Count')
plt.tight_layout()
plt.show()
Split Data¶
In [39]:
from sklearn.model_selection import train_test_split
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_balanced_over, y_balanced_over, test_size=0.3, random_state=42)
print("Shape of training data:", X_train.shape)
print("Shape of testing data:", X_test.shape)
Shape of training data: (499, 65) Shape of testing data: (215, 65)
Support Vector Machine (SVM)¶
In [40]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
# Set random seed for reproducibility
np.random.seed(42)
# Define the parameter grid for SVM
param_grid_svm = {
'C': [0.1, 1, 10, 100],
'gamma': [1, 0.1, 0.01, 0.001],
'kernel': ['rbf', 'linear']
}
# Initialize and fit the grid search
grid_search = GridSearchCV(estimator=SVC(probability=True, random_state=42),
param_grid=param_grid_svm, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)
# Print best parameters and best cross-validation accuracy
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")
# Getting the best SVM model
best_svm_model = grid_search.best_estimator_
# Evaluate the best model on the training data
y_train_pred = best_svm_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy:", train_accuracy)
# Evaluate the best model on the testing data
y_test_pred = best_svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Testing Accuracy:", test_accuracy)
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best parameters: {'C': 100, 'gamma': 1, 'kernel': 'linear'}
Best cross-validation accuracy: 0.9720
Training Accuracy: 1.0
Testing Accuracy: 0.9395348837209302
In [41]:
# Print classification report and confusion matrix for test data
print("SVM Classification Report:\n", classification_report(y_test, y_test_pred))
# Calculate confusion matrix and sensitivity (recall) and specificity
conf_matrix_svm = confusion_matrix(y_test, y_test_pred)
sensitivity_svm = conf_matrix_svm[1, 1] / (conf_matrix_svm[1, 0] + conf_matrix_svm[1, 1])
specificity_svm = conf_matrix_svm[0, 0] / (conf_matrix_svm[0, 0] + conf_matrix_svm[0, 1])
# Print sensitivity, specificity, and confusion matrix
print("Sensitivity (Recall) for Class 1:", sensitivity_svm)
print("Specificity (True Negative Rate) for Class 0:", specificity_svm)
SVM Classification Report:
precision recall f1-score support
0 0.96 0.93 0.94 117
1 0.92 0.95 0.93 98
accuracy 0.94 215
macro avg 0.94 0.94 0.94 215
weighted avg 0.94 0.94 0.94 215
Sensitivity (Recall) for Class 1: 0.9489795918367347
Specificity (True Negative Rate) for Class 0: 0.9316239316239316
In [42]:
# Plot confusion matrix
disp_svm = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_svm)
disp_svm.plot(cmap='Blues')
plt.title('SVM Confusion Matrix')
plt.show()
In [43]:
# Predict probabilities and compute ROC curve and AUC
y_prob_svm = best_svm_model.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob_svm)
roc_auc = auc(fpr, tpr)
# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='orange', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for SVM')
plt.legend(loc="lower right")
plt.show()
#LIME¶
In [44]:
import lime
import lime.lime_tabular
In [45]:
# Create a LIME explainer
explainer = lime.lime_tabular.LimeTabularExplainer(
training_data=X_train.values,
feature_names=X_train.columns,
class_names=['Benign', 'Malignant'],
mode='classification'
)
# Explain a prediction
i = 0 # Change this index to explain different predictions
exp = explainer.explain_instance(
data_row=X_test.values[i],
predict_fn=best_svm_model.predict_proba
)
In [46]:
# Show the explanation
exp.show_in_notebook(show_table=True, show_all=False)
In [47]:
fig = exp.as_pyplot_figure()
plt.show()
Multi-Layer Perceptron (MLP)¶
In [48]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
# Set random seed for reproducibility
np.random.seed(42)
# Train MLP model
mlp_model = MLPClassifier(random_state=42, max_iter=300)
mlp_model.fit(X_train, y_train)
# Predict and evaluate the MLP model
y_pred_mlp = mlp_model.predict(X_test)
In [49]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_mlp)
classification_report_mlp = classification_report(y_test, y_pred_mlp, output_dict=True)
# Calculate sensitivity (recall) and specificity
conf_matrix_mlp = confusion_matrix(y_test, y_pred_mlp)
sensitivity = conf_matrix_mlp[1, 1] / (conf_matrix_mlp[1, 0] + conf_matrix_mlp[1, 1])
specificity = conf_matrix_mlp[0, 0] / (conf_matrix_mlp[0, 0] + conf_matrix_mlp[0, 1])
In [52]:
# Print classification report, sensitivity, specificity, and accuracy
# Classification report
print("MLP Classification Report:")
print(classification_report(y_test, y_pred_mlp))
# Accuracy score
print("MLP Accuracy:", accuracy_score(y_test, y_pred_mlp))
print("Sensitivity (Recall) for Class 1:", sensitivity)
print("Specificity (True Negative Rate) for Class 0:", specificity)
# Print and display the confusion matrix
print(" MLP Confusion Matrix:")
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_mlp)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.show()
MLP Classification Report:
precision recall f1-score support
0 0.96 0.96 0.96 117
1 0.95 0.95 0.95 98
accuracy 0.95 215
macro avg 0.95 0.95 0.95 215
weighted avg 0.95 0.95 0.95 215
MLP Accuracy: 0.9534883720930233
Sensitivity (Recall) for Class 1: 0.9489795918367347
Specificity (True Negative Rate) for Class 0: 0.9572649572649573
MLP Confusion Matrix:
In [54]:
# Predict probabilities
y_prob_mlp = mlp_model.predict_proba(X_test)[:, 1]
# Compute ROC curve and ROC area
fpr, tpr, _ = roc_curve(y_test, y_prob_mlp)
roc_auc = auc(fpr, tpr)
# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='green', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for MLP')
plt.legend(loc="lower right")
plt.show()
Random Forest¶
In [55]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Set random seed for reproducibility
np.random.seed(42)
# Define the parameter grid for Random Forest
param_grid_rf = {
'n_estimators': [100, 200, 300], # Number of trees in the forest
'max_features': ['auto', 'sqrt'], # Number of features to consider at every split
'max_depth': [10, 20, 30, None], # Maximum depth of the tree
'min_samples_split': [2, 5, 10], # Minimum number of samples required to split a node
'min_samples_leaf': [1, 2, 4], # Minimum number of samples required at a leaf node
'bootstrap': [True, False] # Whether bootstrap samples are used when building trees
}
# Initialize and fit the grid search
rf_grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
param_grid=param_grid_rf, cv=5, n_jobs=-1, verbose=2)
rf_grid_search.fit(X_train, y_train)
# Print best parameters and best cross-validation accuracy
print(f"Best parameters: {rf_grid_search.best_params_}")
print(f"Best cross-validation accuracy: {rf_grid_search.best_score_:.4f}")
# Get the best Random Forest model
best_rf_model = rf_grid_search.best_estimator_
# Evaluate the best model on the training data
y_train_pred_rf = best_rf_model.predict(X_train)
train_accuracy_rf = accuracy_score(y_train, y_train_pred_rf)
print("Training Accuracy:", train_accuracy_rf)
# Evaluate the best model on the testing data
y_test_pred_rf = best_rf_model.predict(X_test)
test_accuracy_rf = accuracy_score(y_test, y_test_pred_rf)
print("Testing Accuracy:", test_accuracy_rf)
Fitting 5 folds for each of 432 candidates, totalling 2160 fits
Best parameters: {'bootstrap': False, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}
Best cross-validation accuracy: 0.9860
Training Accuracy: 1.0
Testing Accuracy: 0.958139534883721
In [56]:
# Print classification report and confusion matrix for test data
print("Random Forest Classification Report:\n", classification_report(y_test, y_test_pred_rf))
# Calculate confusion matrix and sensitivity (recall) and specificity
conf_matrix_rf = confusion_matrix(y_test, y_test_pred_rf)
sensitivity_rf = conf_matrix_rf[1, 1] / (conf_matrix_rf[1, 0] + conf_matrix_rf[1, 1])
specificity_rf = conf_matrix_rf[0, 0] / (conf_matrix_rf[0, 0] + conf_matrix_rf[0, 1])
# Print sensitivity, specificity, and confusion matrix
print("Sensitivity (Recall) for Class 1:", sensitivity_rf)
print("Specificity (True Negative Rate) for Class 0:", specificity_rf)
Random Forest Classification Report:
precision recall f1-score support
0 0.96 0.97 0.96 117
1 0.96 0.95 0.95 98
accuracy 0.96 215
macro avg 0.96 0.96 0.96 215
weighted avg 0.96 0.96 0.96 215
Sensitivity (Recall) for Class 1: 0.9489795918367347
Specificity (True Negative Rate) for Class 0: 0.9658119658119658
In [57]:
# Plot confusion matrix
disp_rf = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_rf)
disp_rf.plot(cmap='Blues')
plt.title('Random Forest Confusion Matrix')
plt.show()
ROC Curve¶
In [58]:
# Get predicted probabilities for the positive class (class 1)
y_test_proba_rf = best_rf_model.predict_proba(X_test)[:, 1]
# Compute ROC curve and AUC score
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, y_test_proba_rf)
roc_auc_rf = roc_auc_score(y_test, y_test_proba_rf)
# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr_rf, tpr_rf, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc_rf:.4f})')
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--') # Diagonal line (random model)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Random Forest Model')
plt.legend(loc="lower right")
plt.grid()
plt.show()
XGBOOST¶
In [62]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc, classification_report, accuracy_score
In [63]:
# Define the parameter grid for XGBoost
param_grid_xgb = {
'objective': ['binary:logistic'],
'eval_metric': ['logloss'],
'learning_rate': [0.01, 0.1, 0.2],
'max_depth': [3, 6, 9],
'n_estimators': [100, 200, 300],
'subsample': [0.8, 1.0],
'colsample_bytree': [0.8, 1.0]
}
# Initialize XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42)
# Initialize GridSearchCV
grid_search_xgb = GridSearchCV(
estimator=xgb_model, param_grid=param_grid_xgb, scoring='accuracy', cv=5, n_jobs=-1, verbose=2)
# Fit GridSearchCV
grid_search_xgb.fit(X_train, y_train)
Fitting 5 folds for each of 108 candidates, totalling 540 fits
Out[63]:
GridSearchCV(cv=5,
estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None, colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None, device=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
feature_types=None, gamma=None,
grow_policy=None, importance_type=None,
interaction_constraints=None,
learning_rate=None,...
missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None,
n_jobs=None, num_parallel_tree=None,
random_state=42, ...),
n_jobs=-1,
param_grid={'colsample_bytree': [0.8, 1.0],
'eval_metric': ['logloss'],
'learning_rate': [0.01, 0.1, 0.2],
'max_depth': [3, 6, 9],
'n_estimators': [100, 200, 300],
'objective': ['binary:logistic'],
'subsample': [0.8, 1.0]},
scoring='accuracy', verbose=2)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5,
estimator=XGBClassifier(base_score=None, booster=None,
callbacks=None, colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None, device=None,
early_stopping_rounds=None,
enable_categorical=False, eval_metric=None,
feature_types=None, gamma=None,
grow_policy=None, importance_type=None,
interaction_constraints=None,
learning_rate=None,...
missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=None,
n_jobs=None, num_parallel_tree=None,
random_state=42, ...),
n_jobs=-1,
param_grid={'colsample_bytree': [0.8, 1.0],
'eval_metric': ['logloss'],
'learning_rate': [0.01, 0.1, 0.2],
'max_depth': [3, 6, 9],
'n_estimators': [100, 200, 300],
'objective': ['binary:logistic'],
'subsample': [0.8, 1.0]},
scoring='accuracy', verbose=2)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=1.0, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric='logloss',
feature_types=None, gamma=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=0.1, max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None, n_estimators=100,
n_jobs=None, num_parallel_tree=None, random_state=42, ...)XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=1.0, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric='logloss',
feature_types=None, gamma=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=0.1, max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None, n_estimators=100,
n_jobs=None, num_parallel_tree=None, random_state=42, ...)In [64]:
# Get the best XGBoost model
best_xgb_model = grid_search_xgb.best_estimator_
print(f"Best parameters: {grid_search_xgb.best_params_}")
print(f"Best cross-validation accuracy: {grid_search_xgb.best_score_:.4f}")
# Predict on the test set
y_pred_best_xgb = best_xgb_model.predict(X_test)
y_prob_best_xgb = best_xgb_model.predict_proba(X_test)[:, 1] # Probability estimates for ROC curve
accuracy = accuracy_score(y_test, y_pred_best_xgb)
print("XGBoost Accuracy:", accuracy)
Best parameters: {'colsample_bytree': 1.0, 'eval_metric': 'logloss', 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'objective': 'binary:logistic', 'subsample': 1.0}
Best cross-validation accuracy: 0.9800
XGBoost Accuracy: 0.9627906976744186
In [65]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred_best_xgb))
Classification Report:
precision recall f1-score support
0 0.97 0.97 0.97 117
1 0.96 0.96 0.96 98
accuracy 0.96 215
macro avg 0.96 0.96 0.96 215
weighted avg 0.96 0.96 0.96 215
In [66]:
# Compute confusion matrix
conf_matrix_xgb = confusion_matrix(y_test, y_pred_best_xgb)
# Plot confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_xgb, display_labels=['Class 0', 'Class 1'])
disp.plot(cmap='Blues')
plt.title(' XGBoost Confusion Matrix')
plt.show()
In [67]:
# Compute ROC curve
fpr, tpr, _ = roc_curve(y_test, y_prob_best_xgb)
roc_auc = auc(fpr, tpr)
# Plot ROC curve
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()
XAI - SHAP¶
In [68]:
explainer = shap.Explainer(best_xgb_model, X_train)
shap_values = explainer(X_test)
In [69]:
shap.summary_plot(shap_values, X_test)
# Save the plot to a file
plt.savefig('xgboostplot.png', dpi=300, bbox_inches='tight')
plt.show()
<Figure size 640x480 with 0 Axes>
In [70]:
shap.dependence_plot("perimeter_worst", shap_values.values, X_test)
In [71]:
#force plot
shap.initjs()
shap.force_plot(explainer.expected_value, shap_values.values[0], X_test.iloc[0, :])
Out[71]:
Visualization omitted, Javascript library not loaded!
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
Have you run `initjs()` in this notebook? If this notebook was from another user you must also trust this notebook (File -> Trust notebook). If you are viewing this notebook on github the Javascript has been stripped for security. If you are using JupyterLab this error is because a JupyterLab extension has not yet been written.
In [72]:
#waterfall plot
#Breaks down an individual prediction to show how each feature contributes to the final prediction.
shap.waterfall_plot(shap.Explanation(values=shap_values[0], base_values=explainer.expected_value, data=X_test.iloc[0]))
CATBOOST¶
In [73]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, ConfusionMatrixDisplay
# Train CatBoost model
catboost_model = CatBoostClassifier(random_state=42, verbose=0)
catboost_model.fit(X_train, y_train)
# Predict and evaluate the CatBoost model
y_pred_catboost = catboost_model.predict(X_test)
In [74]:
# Classification report
classification_report_catboost = classification_report(y_test, y_pred_catboost, output_dict=True)
# Calculate sensitivity (recall) and specificity
sensitivity = classification_report_catboost['1']['recall']
conf_matrix_catboost = confusion_matrix(y_test, y_pred_catboost)
specificity = conf_matrix_catboost[0, 0] / (conf_matrix_catboost[0, 0] + conf_matrix_catboost[0, 1])
accuracy = accuracy_score(y_test, y_pred_catboost)
In [75]:
# Print classification report, sensitivity, specificity, and accuracy
print("CatBoost Classification Report:")
print(classification_report(y_test, y_pred_catboost))
print("CatBoost Accuracy:", accuracy)
print("Sensitivity (Recall) for Class 1:", sensitivity)
print("Specificity (True Negative Rate) for Class 0:", specificity)
# Print confusion matrix
print("Confusion Matrix:")
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_catboost)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.show()
CatBoost Classification Report:
precision recall f1-score support
0 0.96 0.97 0.97 117
1 0.97 0.95 0.96 98
accuracy 0.96 215
macro avg 0.96 0.96 0.96 215
weighted avg 0.96 0.96 0.96 215
CatBoost Accuracy: 0.9627906976744186
Sensitivity (Recall) for Class 1: 0.9489795918367347
Specificity (True Negative Rate) for Class 0: 0.9743589743589743
Confusion Matrix:
In [76]:
# Predict probabilities
y_prob_catboost = catboost_model.predict_proba(X_test)[:, 1]
# Compute ROC curve and ROC area
fpr, tpr, _ = roc_curve(y_test, y_prob_catboost)
roc_auc = auc(fpr, tpr)
# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='purple', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for CatBoost')
plt.legend(loc="lower right")
plt.show()
SHAP - CATBOOST¶
In [77]:
explainer1 = shap.TreeExplainer(catboost_model)
shap_values1 = explainer.shap_values(X_test)
In [78]:
if isinstance(shap_values, list):
shap_values1 = shap_values[0]
In [79]:
# Summary plot
shap.summary_plot(shap_values1, X_test)
In [80]:
# Dependence plot for a specific feature
shap.dependence_plot("area_worst", shap_values1, X_test)
In [81]:
# Calculating SHAP interaction values
explainer_catboost = shap.TreeExplainer(catboost_model)
shap_interaction_values_catboost = explainer_catboost.shap_interaction_values(X_test)
In [82]:
# Interaction value matrix plot
shap.summary_plot(shap_interaction_values_catboost, X_test, plot_type="interaction")
Stacking Classifier - Meta-model¶
#Defining all the base models¶
In [107]:
from xgboost import XGBClassifier
# Define base models
base_models = [
('MLP', MLPClassifier(random_state=42, max_iter=300)),
('XGBoost', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')),
('CatBboost', CatBoostClassifier(random_state=42, verbose=0))
]
In [108]:
# Define meta-learner
meta_learner = LogisticRegression()
In [109]:
# Creating stacking classifier
stacking_clf = StackingClassifier(
estimators=base_models,
final_estimator=meta_learner,
cv=5 # Cross-validation folds for stacking
)
In [110]:
# Training the stacking classifier
stacking_clf.fit(X_train, y_train)
Out[110]:
StackingClassifier(cv=5,
estimators=[('MLP',
MLPClassifier(max_iter=300, random_state=42)),
('XGBoost',
XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
device=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric='mlogloss',
feature_types=None, gamma=None,
grow_poli...
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None,
max_depth=None, max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=None, n_jobs=None,
num_parallel_tree=None,
random_state=42, ...)),
('CatBboost',
<catboost.core.CatBoostClassifier object at 0x0000029A741B6D10>)],
final_estimator=LogisticRegression())In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StackingClassifier(cv=5,
estimators=[('MLP',
MLPClassifier(max_iter=300, random_state=42)),
('XGBoost',
XGBClassifier(base_score=None, booster=None,
callbacks=None,
colsample_bylevel=None,
colsample_bynode=None,
colsample_bytree=None,
device=None,
early_stopping_rounds=None,
enable_categorical=False,
eval_metric='mlogloss',
feature_types=None, gamma=None,
grow_poli...
max_cat_threshold=None,
max_cat_to_onehot=None,
max_delta_step=None,
max_depth=None, max_leaves=None,
min_child_weight=None,
missing=nan,
monotone_constraints=None,
multi_strategy=None,
n_estimators=None, n_jobs=None,
num_parallel_tree=None,
random_state=42, ...)),
('CatBboost',
<catboost.core.CatBoostClassifier object at 0x0000029A741B6D10>)],
final_estimator=LogisticRegression())MLPClassifier(max_iter=300, random_state=42)
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric='mlogloss',
feature_types=None, gamma=None, grow_policy=None,
importance_type=None, interaction_constraints=None,
learning_rate=None, max_bin=None, max_cat_threshold=None,
max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
max_leaves=None, min_child_weight=None, missing=nan,
monotone_constraints=None, multi_strategy=None, n_estimators=None,
n_jobs=None, num_parallel_tree=None, random_state=42, ...)<catboost.core.CatBoostClassifier object at 0x0000029A741B6D10>
LogisticRegression()
In [111]:
# Predict and evaluate the stacking classifier
y_pred_stacking = stacking_clf.predict(X_test)
y_prob_stacking = stacking_clf.predict_proba(X_test)[:, 1]
In [112]:
# Classification report
print("Stacking Classifier Classification Report:")
print(classification_report(y_test, y_pred_stacking))
# Accuracy score
accuracy = accuracy_score(y_test, y_pred_stacking)
print("Stacking Classifier Accuracy:", accuracy)
# Calculate sensitivity (recall) and specificity
conf_matrix_stacking = confusion_matrix(y_test, y_pred_stacking)
sensitivity = conf_matrix_stacking[1, 1] / (conf_matrix_stacking[1, 0] + conf_matrix_stacking[1, 1])
specificity = conf_matrix_stacking[0, 0] / (conf_matrix_stacking[0, 0] + conf_matrix_stacking[0, 1])
print("Sensitivity (Recall) for Class 1:", sensitivity)
print("Specificity (True Negative Rate) for Class 0:", specificity)
# Print the confusion matrix
print("Confusion Matrix:")
print(conf_matrix_stacking)
disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_stacking)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.show()
Stacking Classifier Classification Report:
precision recall f1-score support
0 0.96 0.97 0.97 117
1 0.97 0.95 0.96 98
accuracy 0.96 215
macro avg 0.96 0.96 0.96 215
weighted avg 0.96 0.96 0.96 215
Stacking Classifier Accuracy: 0.9627906976744186
Sensitivity (Recall) for Class 1: 0.9489795918367347
Specificity (True Negative Rate) for Class 0: 0.9743589743589743
Confusion Matrix:
[[114 3]
[ 5 93]]
In [113]:
# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='purple', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Stacking Classifier')
plt.legend(loc="lower right")
plt.show()
PDP¶
In [114]:
from sklearn.inspection import PartialDependenceDisplay
In [115]:
# For a single feature
PartialDependenceDisplay.from_estimator(stacking_clf, X_train, [2], feature_names=X_train.columns)
plt.show()
In [116]:
# Define colors for each feature
colors = ['red', 'green', 'blue']
# Create PDP plots with different colors
fig, axs = plt.subplots(1, len(features), figsize=(15, 5))
features = [6, 3, 7] # Indices of features to plot
for i, (feature, color) in enumerate(zip(features, colors)):
PartialDependenceDisplay.from_estimator(stacking_clf, X_train, [feature], feature_names=X_train.columns, ax=axs[i], line_kw={'color': color})
plt.tight_layout()
plt.show()
In [117]:
# For interaction between two features
PartialDependenceDisplay.from_estimator(stacking_clf, X_train, [(6, 3)], feature_names=X_train.columns)
plt.show()
In [118]:
import lime
from lime.lime_tabular import LimeTabularExplainer
explainer = LimeTabularExplainer(X_train.values, feature_names=X_train.columns, class_names=['0', '1'], mode='classification')
lime_exp = explainer.explain_instance(X_test.iloc[0].values, stacking_clf.predict_proba)
lime_exp.show_in_notebook()